.pdf file (including any handwritten solutions scanned and merged into the PDF, if applicable)..ipynb file. In this problem, the multi-resolution Lucas-Kanade algorithm for estimating optical flow will be implemented, and the data needed for this problem can be found in the folder 'optical_flow_images'.
An example optical flow output is shown below - this is not a solution, just an example output.
Note 1: You can choose to implement a single scale version of Lucas-Kanade instead of multi-scale for partial credit in part 1.
Note 2: You are free to use either multi-scale or single-scale version to complete part 3 and part 4 without incurring any penalty.

Implement the Lucas-Kanade method for estimating optical flow. The function 'LucasKanadeMultiScale' needs to be completed. You can use 'upsample_flow' and 'OpticalFlowRefine' as 2 building blocks in order to complete this.
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import convolve, convolve2d
from scipy.ndimage.filters import gaussian_filter
from scipy.signal import convolve
from skimage.transform import resize
from scipy.signal import convolve, convolve2d
# from tqdm import tqdm_notebook
def gradient(image):
    """Return (gX, gY): horizontal and vertical central-difference gradients of a 2D image."""
    horiz_kernel = np.array([[1, 0, -1]])
    vert_kernel = horiz_kernel.T
    grad_x = convolve2d(image, horiz_kernel, mode='same')
    grad_y = convolve2d(image, vert_kernel, mode='same')
    return grad_x, grad_y
def grayscale(img):
    '''
    Converts an RGB image (H, W, 3) to grayscale using the ITU-R BT.601 luma weights.
    '''
    weights = (0.2989, 0.5870, 0.1140)
    return sum(w * img[:, :, ch] for ch, w in enumerate(weights))
def plot_optical_flow(img0,img1,U,V,titleStr, color=False):
    '''
    Plots optical flow given U,V and the images.

    img0, img1 -- the two RGB frames; img0 is also the background of the quiver plot
    U, V       -- per-pixel horizontal / vertical flow components (same H x W as img0)
    titleStr   -- title for the flow subplot
    color      -- if True, draw the quiver over the RGB frame instead of grayscale
    '''
    # Change t if required, affects the number of arrows
    # t should be between 1 and min(U.shape[0],U.shape[1])
    t=8
    # Subsample U and V to get visually pleasing output
    U1 = U[::t,::t]
    V1 = V[::t,::t]
    # Create meshgrid of subsampled coordinates
    r, c = img0.shape[0],img0.shape[1]
    cols,rows = np.meshgrid(np.linspace(0,c-1,c), np.linspace(0,r-1,r))
    cols = cols[::t,::t]
    rows = rows[::t,::t]
    # Plot optical flow
    plt.figure(figsize=(20,20))
    plt.subplot(121)
    # Both frames at half opacity so motion appears as ghosting.
    plt.imshow(img0, alpha=0.5)
    plt.imshow(img1, alpha=0.5)
    plt.title('Overlayed Images')
    plt.subplot(122)
    if color:
        plt.imshow(img0)
    else:
        plt.imshow(grayscale(img0), cmap='gray')
    plt.quiver(cols,rows,U1,V1)
    plt.title(titleStr)
    plt.show()
# Load the four test frames; crop each to 288 columns so all frames align.
images=[]
for i in range(1,5):
    images.append(plt.imread('optical_flow_images/im'+str(i)+'.png')[:,:288,:])
# each image after converting to gray scale is of size -> 400x288
# you can use interpolate from scipy
# You can implement 'upsample_flow' and 'OpticalFlowRefine'
# as 2 building blocks in order to complete this.
import scipy.misc
from scipy.signal import convolve2d
from skimage.transform import resize
def upsample_flow(u_prev, v_prev):
    ''' Upsample optical flow from the previous (coarser) pyramid level.

    u_prev, v_prev -> optical flow from the previous level (or None, None)
    Returns u, v   -> flow bilinearly resized to twice the resolution, with the
    flow *values* doubled, since one pixel of displacement at the coarse level
    spans two pixels at the finer level.
    '''
    if u_prev is None and v_prev is None:
        return u_prev, v_prev
    # Fix: size v by its own shape (previously both used u_prev.shape, a latent
    # bug if the two fields ever differed in size).
    target_u = (u_prev.shape[0] * 2, u_prev.shape[1] * 2)
    target_v = (v_prev.shape[0] * 2, v_prev.shape[1] * 2)
    u = resize(u_prev, target_u, order=1) * 2
    v = resize(v_prev, target_v, order=1) * 2
    return u, v
def warp(im, u_prev, v_prev):
    """Warp image `im` by flow (u_prev, v_prev) with nearest-neighbor sampling.

    v is the vertical (row) displacement, u the horizontal (column) one.
    Pixels whose source lands outside the image stay 0.
    """
    warped = np.zeros(im.shape)
    for i in range(im.shape[0]):
        for j in range(im.shape[1]):
            # np.int was removed in NumPy 1.24; the builtin int is correct here.
            src_r = int(np.rint(i + v_prev[i, j]))
            src_c = int(np.rint(j + u_prev[i, j]))
            if 0 <= src_r < im.shape[0] and 0 <= src_c < im.shape[1]:
                warped[i, j] = im[src_r, src_c]
    return warped
def It_patch(im2, im1, u_prev, v_prev, i, j, sub, add):
    """Temporal-difference patch centered at (i, j).

    The window in im2 is shifted by the rounded previous flow at (i, j) before
    being differenced against the unshifted window of im1.
    """
    # np.int was removed in NumPy 1.24; use the builtin int.
    u_shift = int(np.rint(u_prev[i, j]))
    v_shift = int(np.rint(v_prev[i, j]))
    patch1 = im1[i - sub:i + add, j - sub:j + add]
    patch2 = im2[i - sub + v_shift:i + add + v_shift,
                 j - sub + u_shift:j + add + u_shift]
    return patch2 - patch1
def Ix2(im, dim):
    """Return (Ix, S): horizontal gradient of `im` and Ix**2 summed over a
    dim x dim window centered at each pixel (via box-filter convolution)."""
    grad_x, _ = gradient(im)
    window_sum = convolve2d(grad_x * grad_x, np.ones((dim, dim)), mode='same')
    return grad_x, window_sum
def Iy2(im, dim):
    """Return (Iy, S): vertical gradient of `im` and Iy**2 summed over a
    dim x dim window centered at each pixel (via box-filter convolution)."""
    _, grad_y = gradient(im)
    window_sum = convolve2d(grad_y * grad_y, np.ones((dim, dim)), mode='same')
    return grad_y, window_sum
def IxIy(im, dim):
    """Return Ix*Iy summed over a dim x dim window centered at each pixel."""
    grad_x, grad_y = gradient(im)
    return convolve2d(grad_x * grad_y, np.ones((dim, dim)), mode='same')
def OpticalFlowRefine(im1, im2, window, u_prev=None, v_prev=None):
    '''
    Single-level Lucas-Kanade refinement.

    Inputs: the two grayscale images at the current level and window size
    u_prev, v_prev - previous level's (upsampled) optical flow, or None
    Return u, v - total optical flow at the current level (previous flow included)
    '''
    u, v = np.zeros(im1.shape), np.zeros(im1.shape)
    if u_prev is None and v_prev is None:
        u_prev, v_prev = np.zeros(im1.shape), np.zeros(im1.shape)
    # np.int was removed in NumPy 1.24; the builtin int is the correct dtype.
    u_prev = np.round(u_prev).astype(int)
    v_prev = np.round(v_prev).astype(int)
    # Window-summed gradient products for the 2x2 structure tensor.
    Ix, Ix2Sum = Ix2(im1, window)
    Iy, Iy2Sum = Iy2(im1, window)
    IxIySum = IxIy(im1, window)
    sub = window // 2
    add = window // 2
    rows, cols = im1.shape[0], im1.shape[1]
    # Border pixels without a full window keep zero flow.
    for i in range(sub, rows - add - 1):
        for j in range(sub, cols - add - 1):
            # Gradient patches around (i, j); the temporal patch is shifted by
            # the rounded previous flow before differencing.  (The original
            # even/odd-window branches were identical, so they are merged.)
            wX = Ix[i - sub:i + add, j - sub:j + add]
            wY = Iy[i - sub:i + add, j - sub:j + add]
            wT = It_patch(im2, im1, u_prev, v_prev, i, j, sub, add)
            A = np.array([[Ix2Sum[i, j], IxIySum[i, j]],
                          [IxIySum[i, j], Iy2Sum[i, j]]])
            b = np.array([[np.sum(-wX * wT)],
                          [np.sum(-wY * wT)]])
            # pinv is robust to (near-)singular structure tensors in flat regions.
            uv = np.matmul(np.linalg.pinv(A), b)
            u[i, j] = uv[0]
            v[i, j] = uv[1]
    # Total flow = refinement + flow carried over from the coarser level.
    return u + u_prev, v + v_prev
def gaussian2d(filter_size=5, sig=1.0):
    """Create a normalized 2D Gaussian kernel with side length `filter_size`
    and standard deviation `sig`."""
    # Integer offsets centered on the kernel (e.g. [-2..2] for size 5).
    offsets = np.arange(filter_size) - (filter_size - 1) // 2
    xx, yy = np.meshgrid(offsets, offsets)
    kern = np.exp(-(xx ** 2 + yy ** 2) / (2.0 * sig ** 2))
    return kern / kern.sum()
def downSampleIm(im1, im2, level):
    """Return level-`level` of the Gaussian pyramid for both grayscale images
    (level 1 = original resolution): smooth with a Gaussian, then take every
    second pixel, repeated level-1 times."""
    kernel = gaussian2d()
    out1, out2 = im1, im2
    for _ in range(level - 1):
        out1 = convolve2d(out1, kernel, mode='same')[::2, ::2]
        out2 = convolve2d(out2, kernel, mode='same')[::2, ::2]
    return out1, out2
def LucasKanadeMultiScale(im1, im2, window, numLevels=2):
    '''
    Multi-resolution Lucas-Kanade optical flow.
    Inputs: the two images, window size and number of levels
    if numLevels = 1, then compute optical flow at only the given image level.
    Returns: u, v - the optical flow
    '''
    # Coarsest level first: plain Lucas-Kanade with zero initial flow.
    coarse1, coarse2 = downSampleIm(im1, im2, numLevels)
    u, v = OpticalFlowRefine(coarse1, coarse2, window)
    # Walk back down the pyramid, seeding each level with the upsampled flow.
    for level in range(numLevels - 1, 0, -1):
        seed_u, seed_v = upsample_flow(u, v)
        lvl1, lvl2 = downSampleIm(im1, im2, level)
        u, v = OpticalFlowRefine(lvl1, lvl2, window, seed_u, seed_v)
    return u, v
Plot optical flow for the pair of images im1 and im2 for different number of levels mentioned below. Comment on the results and justify.
(i) window size = 13, numLevels = 1
(ii) window size = 13, numLevels = 3
(iii) window size = 13, numLevels = 5
So, you are expected to provide 3 outputs here
Note: if numLevels = 1, then it means the optical flow is only computed at the image resolution i.e. no downsampling
# Example code to generate output
# Part 1: same window, increasing number of pyramid levels.
window=13
numLevels=1
# Single level: plain Lucas-Kanade at full resolution (no downsampling).
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
# V is negated because image row indices grow downward while quiver's y-axis points up.
plot_optical_flow(images[0],images[1],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window))
numLevels=3
# Plot
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
plot_optical_flow(images[0],images[1],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window))
numLevels=5
# Plot
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
plot_optical_flow(images[0],images[1],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window))
window=13
numLevels=5
# Plot
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
plot_optical_flow(images[0],images[1],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window))
As the number of levels increases, the optical flow outputted becomes better. By better it is meant that there is less extraneous noise in the form of abnormally long arrows, as well as each of the arrows agree with the other arrows around it, showing a consistent optical flow is plausible for the object to have undergone. At numLevels = 1 the image is almost useless as far as determining the optical flow of the object. At numLevels = 3, there is still noise but a human can logically pick out and ignore the noise and thus get a good idea of the true optical flow. At numLevels = 5, almost all the noise is gone and the arrows agree with each other, demonstrating a consistent optical flow of the object in the image. This optical flow agrees with the movement that would be inferred by looking at pictures 1 and 2. The reason for the improvement in optical flow as the number of levels increases is because at higher levels, the optical flow is found first for a very coarse image (the higher the level the coarser the image), and then refined with finer and finer images with each level, giving the consistent result seen in numLevels = 5, as opposed to the noisy result seen in numLevels = 3.
Plot optical flow for the pair of images im1 and im2 for at least 3 different window sizes which leads to observable difference in the results. Comment on the effect of window size on results and justify. For this part fix the number of levels to be 3 for multi-scale Lucas-Kanade.
# Example code, change as required
# Part 2: fixed pyramid depth, three window sizes for comparison.
numLevels=3
w1, w2, w3 = 15, 25, 35
for window in [w1, w2, w3]:
    U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
    window,numLevels)
    plot_optical_flow(images[0],images[1], U,-V, \
    'levels = ' + str(numLevels) + ', window = '+str(window))
For Part 3, numLevels is fixed at 3, giving decent results, and the window size is varied. The smallest window size, 15, gives the result with the highest noise and least consistent flow, window size 25 is in the middle as far as noise and flow consistency goes, and window size of 35 gives the lowest noise and highest consistency. This is expected of window size, as a larger window takes more of the pixels surrounding the pixel of interest into account, resulting in a sort of smoothing effect in the optical flow.
However, these are not the only determining factors of quality, as accuracy of the flow direction is important as well. The smallest window size has arrows that, though noisey, seem to point in the most exact angle. What is meant by this is that as window size is increased, the arrow direction appears to be more and more discretized, with window size of 35 having most of its arrows seem to point in angles in increments of 45 degrees. This results in a smoother flow but may be potentially less accurate. This is expected for the reasons stated above: a larger window will result in a smoothed optical flow, while a smaller window will improve the details.
This reasoning makes sense, as the algorithm makes the assumption that all pixels a given window have the same velocity, so clearly a larger window will force more of them to have that same velocity, basically smoothing. With this information, the window size can be chosen for the application; applications requiring a smooth flow can use a larger window size and applications requiring high detail in the flow can use a smaller window size.
Find optical flow for the pairs (im1,im2), (im1,im3), (im1,im4) using one good window size and number of levels. Does the optical flow result seem consistent with visual inspection? Comment on the type of motion indicated by results and visual inspection and explain why they might be consistent or inconsistent.
# Your code here
# use one fixed window and numLevels for all pairs
# Example code to generate output
# Part 4: one fixed configuration, three image pairs (translation / rotation / zoom).
window=25
numLevels=5
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
plot_optical_flow(images[0],images[1],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 2')
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[2]),\
window,numLevels)
plot_optical_flow(images[0],images[2],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 3')
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[3]),\
window,numLevels)
plot_optical_flow(images[0],images[3],U,-V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 4')
For image 1 and image 2, the visual inspection indicates that the object moved to the left and up. The optical flow shows that the image did indeed move up and to the left with the arrows pointing in a consistent flow in the direction. So motion is consistent with optical flow results.
For image 1 and image 3, the object undergoes a small clockwise rotation. This is picked up in the optical flow, as all arrows on the object rotate clockwise as well. The center of rotation is clearly visible on the optical flow as well. The background also shows some clockwise rotation on the optical flow, likely the result of slight discoloring of the surface the object was sitting on. Overall this is a good optical flow result.
For image 1 and image 4, the object is further away, likely zoomed out on. The optical flow shows this as the arrows point inwards towards the point on the image that stayed unchanged as the object was zoomed out on. This indicates good optical flow results.
In this problem, we extend optical flow to RGB images.
In lecture, we used brightness constancy constraint and taylor series expansion to come up with the optical flow equation $$I_xu +I_yv+I_t = 0$$ and we derived the Lucas-Kanade least squares solution assuming same velocity over a patch as:
$$ \left(\begin{array}{cc} \sum I_x^2 & \sum I_xI_y\\ \sum I_xI_y & \sum I_y^2 \end{array}\right) \left(\begin{array}{c} u \\ v \end{array}\right) = - \left(\begin{array}{c} \sum I_xI_t\\ \sum I_yI_t \end{array}\right) $$Derive a similar equation for RGB images under a color constancy constraint by applying the brightness constancy constraint to each of the channels, and formulating a cost function over the three channels.
# Display the scanned handwritten derivation of the RGB color-constancy equations.
image = plt.imread('FullSizeRender.jpeg')
plt.figure(figsize=(20, 20))
plt.imshow(image)
plt.show()
Complete the functions LucasKanadeMultiScaleRGB and OpticalFlowRefineRGB and run it for window size = 13 and number of levels = 3
Note: You can implement the single scale version without incurring any penalty.
def warp(im, u_prev, v_prev):
    """Warp image `im` by flow (u_prev, v_prev) with nearest-neighbor sampling.

    v is the vertical (row) displacement, u the horizontal (column) one.
    Pixels whose source lands outside the image stay 0.
    """
    warped = np.zeros(im.shape)
    for i in range(im.shape[0]):
        for j in range(im.shape[1]):
            # np.int was removed in NumPy 1.24; the builtin int is correct here.
            src_r = int(np.rint(i + v_prev[i, j]))
            src_c = int(np.rint(j + u_prev[i, j]))
            if 0 <= src_r < im.shape[0] and 0 <= src_c < im.shape[1]:
                warped[i, j] = im[src_r, src_c]
    return warped
def It_patch(im2, im1, u_prev, v_prev, i, j, sub, add):
    """Temporal-difference patch centered at (i, j).

    The window in im2 is shifted by the rounded previous flow at (i, j) before
    being differenced against the unshifted window of im1.
    """
    # np.int was removed in NumPy 1.24; use the builtin int.
    u_shift = int(np.rint(u_prev[i, j]))
    v_shift = int(np.rint(v_prev[i, j]))
    patch1 = im1[i - sub:i + add, j - sub:j + add]
    patch2 = im2[i - sub + v_shift:i + add + v_shift,
                 j - sub + u_shift:j + add + u_shift]
    return patch2 - patch1
def Ix2(im, dim):
    """Return (Ix, S): horizontal gradient of `im` and Ix**2 summed over a
    dim x dim window centered at each pixel (via box-filter convolution)."""
    grad_x, _ = gradient(im)
    window_sum = convolve2d(grad_x * grad_x, np.ones((dim, dim)), mode='same')
    return grad_x, window_sum
def Iy2(im, dim):
    """Return (Iy, S): vertical gradient of `im` and Iy**2 summed over a
    dim x dim window centered at each pixel (via box-filter convolution)."""
    _, grad_y = gradient(im)
    window_sum = convolve2d(grad_y * grad_y, np.ones((dim, dim)), mode='same')
    return grad_y, window_sum
def IxIy(im, dim):
    """Return Ix*Iy summed over a dim x dim window centered at each pixel."""
    grad_x, grad_y = gradient(im)
    return convolve2d(grad_x * grad_y, np.ones((dim, dim)), mode='same')
def OpticalFlowRefine_RGB(im1, im2, window, u_prev=None, v_prev=None):
    '''
    Single-level Lucas-Kanade refinement for RGB images.

    Inputs: the two RGB images at the current level and window size
    u_prev, v_prev - previous level's (upsampled) optical flow, or None
    Return u, v - total optical flow at the current level

    Under color constancy the per-channel structure tensors and mismatch
    vectors are simply summed over R, G and B before solving the 2x2 system.
    '''
    channels1 = [im1[:, :, c] for c in range(3)]
    channels2 = [im2[:, :, c] for c in range(3)]
    u = np.zeros(channels1[0].shape)
    v = np.zeros(channels1[0].shape)
    if u_prev is None and v_prev is None:
        u_prev = np.zeros(channels1[0].shape)
        v_prev = np.zeros(channels1[0].shape)
    # np.int was removed in NumPy 1.24; the builtin int is the correct dtype.
    u_prev = np.round(u_prev).astype(int)
    v_prev = np.round(v_prev).astype(int)
    # Per-channel gradients and window-summed gradient products.
    Ix, Ix2Sum, Iy, Iy2Sum, IxIySum = [], [], [], [], []
    for ch in channels1:
        gx, gx2 = Ix2(ch, window)
        gy, gy2 = Iy2(ch, window)
        Ix.append(gx)
        Ix2Sum.append(gx2)
        Iy.append(gy)
        Iy2Sum.append(gy2)
        IxIySum.append(IxIy(ch, window))
    sub = window // 2
    add = window // 2
    sz = im1.shape
    # Border pixels without a full window keep zero flow.
    for i in range(sub, sz[0] - add - 1):
        for j in range(sub, sz[1] - add - 1):
            A = np.zeros((2, 2))
            b = np.zeros((2, 1))
            # Accumulate the normal equations over the three color channels.
            for c in range(3):
                wX = Ix[c][i - sub:i + add, j - sub:j + add]
                wY = Iy[c][i - sub:i + add, j - sub:j + add]
                wT = It_patch(channels2[c], channels1[c], u_prev, v_prev,
                              i, j, sub, add)
                A[0, 0] += Ix2Sum[c][i, j]
                A[0, 1] += IxIySum[c][i, j]
                A[1, 1] += Iy2Sum[c][i, j]
                b[0, 0] += np.sum(-wX * wT)
                b[1, 0] += np.sum(-wY * wT)
            A[1, 0] = A[0, 1]
            # pinv is robust to (near-)singular structure tensors in flat regions.
            uv = np.matmul(np.linalg.pinv(A), b)
            u[i, j] = uv[0]
            v[i, j] = uv[1]
    # Total flow = refinement + flow carried over from the coarser level.
    return u + u_prev, v + v_prev
def gaussian2d(filter_size=5, sig=1.0):
    """Create a normalized 2D Gaussian kernel with side length `filter_size`
    and standard deviation `sig`."""
    # Integer offsets centered on the kernel (e.g. [-2..2] for size 5).
    offsets = np.arange(filter_size) - (filter_size - 1) // 2
    xx, yy = np.meshgrid(offsets, offsets)
    kern = np.exp(-(xx ** 2 + yy ** 2) / (2.0 * sig ** 2))
    return kern / kern.sum()
def downSampleImRGB(im1, im2, level):
    """Return level-`level` Gaussian pyramid versions of both RGB images
    (level 1 = original resolution).  Each channel is smoothed and decimated
    independently, level-1 times."""
    if level == 1:
        # Full resolution: hand back the inputs unchanged.
        return im1, im2
    kernel = gaussian2d()
    chans1 = [im1[:, :, c] for c in range(3)]
    chans2 = [im2[:, :, c] for c in range(3)]
    for _ in range(level - 1):
        chans1 = [convolve2d(ch, kernel, mode='same')[::2, ::2] for ch in chans1]
        chans2 = [convolve2d(ch, kernel, mode='same')[::2, ::2] for ch in chans2]
    # Re-stack the channels into (H, W, 3) float arrays.
    return np.dstack(chans1), np.dstack(chans2)
def LucasKanadeMultiScaleRGB(im1, im2, window, numLevels=2):
    '''
    Multi-resolution Lucas-Kanade for RGB images.
    Inputs: the two RGB images, window size and number of levels
    if numLevels = 1, then compute optical flow at only the given image level.
    Returns: u, v - the optical flow
    '''
    # Coarsest level first: RGB Lucas-Kanade with zero initial flow.
    base1, base2 = downSampleImRGB(im1, im2, numLevels)
    u, v = OpticalFlowRefine_RGB(base1, base2, window)
    # Refine down the pyramid, seeding each level with the upsampled flow.
    for level in range(numLevels - 1, 0, -1):
        seed_u, seed_v = upsample_flow(u, v)
        lvl1, lvl2 = downSampleImRGB(im1, im2, level)
        u, v = OpticalFlowRefine_RGB(lvl1, lvl2, window, seed_u, seed_v)
    return u, v
# Your code here
# use one fixed window and numLevels for all pairs
# Example code to generate output
# Compare RGB and grayscale flow on each image pair with the same settings.
window=13
numLevels=3
U,V=LucasKanadeMultiScaleRGB(images[0][:,:,:3],images[1][:,:,:3],\
window,numLevels)
plot_optical_flow(images[0],images[1],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 2', color=True)
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[1]),\
window,numLevels)
plot_optical_flow(images[0],images[1],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 2 grayscale')
# Your code here
# use one fixed window and numLevels for all pairs
# Example code to generate output
window=13
numLevels=3
U,V=LucasKanadeMultiScaleRGB(images[0][:,:,:3],images[2][:,:,:3],\
window,numLevels)
plot_optical_flow(images[0],images[2],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 3', color=True)
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[2]),\
window,numLevels)
plot_optical_flow(images[0],images[2],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 3 grayscale')
# Your code here
# use one fixed window and numLevels for all pairs
# Example code to generate output
window=13
numLevels=3
U,V=LucasKanadeMultiScaleRGB(images[0][:,:,:3],images[3][:,:,:3],\
window,numLevels)
plot_optical_flow(images[0],images[3],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 4', color=True)
U,V=LucasKanadeMultiScale(grayscale(images[0]),grayscale(images[3]),\
window,numLevels)
plot_optical_flow(images[0],images[3],U,V, \
'levels = ' + str(numLevels) + ', window = '+str(window) + ', image 1 and image 4 grayscale')
Comment on differences in outputs of the RGB optical flow with the grayscale version
As only 3 levels were used, the results of optical flow are not as good as they'd be with 5 levels. Across all 3 sets of images, the RGB optical flow performs better. There is less noise in the form of long arrows pointing in the wrong direction on RGB than there is seen on the grayscale version. This implies more consistency pixel to pixel in the optical flow determined, which is likely the result of having more information in the form of 3 color channels as opposed to 1. RGB, though an improvement, was not a big improvement considering the added runtime. The grayscale version of the algorithm performs better than the RGB version when given more levels than the RGB version. It also performs faster, owing to the fact that it has more levels, but only 1 channel instead of 3. Therefore the RGB solution does not give the best results in terms of the quality-runtime trade-off. This likely has to do with the fact that though RGB has 3 channels, the channels are highly correlated with each other, limiting the amount of new information that can be used in optical flow.
In this problem, you will implement several machine learning solutions for computer vision problems.
Follow the directions on https://pytorch.org/get-started/locally/ to install Pytorch on your computer.
Note: You will not need GPU support for this assignment so don't worry if you don't have one. Furthermore, installing with GPU support is often more difficult to configure so it is suggested that you install the CPU only version. TA's will not provide any support related to GPU or CUDA.
Run the torch import statements below to verify your installation.
Download the MNIST data from http://yann.lecun.com/exdb/mnist/.
Download the 4 zipped files, extract them into one folder, and change the variable 'path' in the code below. (Code taken from https://gist.github.com/akesling/5358964 )
Plot one random example image corresponding to each label from training data.
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
# Sanity check that the PyTorch installation works.
x = torch.rand(5, 5)
print(x)
import os
# Absolute path of the (expected) MNIST data directory.
BASE_DIR = os.path.abspath('MNIST')
print(BASE_DIR)
import os
import struct
# Change path as required
# Directory containing the four extracted MNIST idx files.
path = "./mnist/"
def read(dataset = "training", datatype='images'):
    """
    Python generator for importing the MNIST data set.  Yields, per example,
    either a numpy.uint8 2D pixel array (datatype='images') or an int8 label
    (datatype='labels'), depending on `datatype`.
    """
    # '==' (value equality), not 'is' (object identity): comparing strings with
    # 'is' relies on interning and raises a SyntaxWarning on modern Python.
    if dataset == "training":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        # Previously an unknown dataset fell through to an opaque NameError.
        raise ValueError("dataset must be 'training' or 'testing'")
    # Load everything in some numpy arrays (headers are big-endian per the IDX format).
    with open(fname_lbl, 'rb') as flbl:
        magic, num = struct.unpack(">II", flbl.read(8))
        lbl = np.fromfile(flbl, dtype=np.int8)
    with open(fname_img, 'rb') as fimg:
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)
    if(datatype=='images'):
        get_data = lambda idx: img[idx]
    elif(datatype=='labels'):
        get_data = lambda idx: lbl[idx]
    # Create an iterator which returns each image in turn
    for i in range(len(lbl)):
        yield get_data(i)
# Materialize the generators into arrays once so the data can be indexed freely.
trainData=np.array(list(read('training','images')))
trainLabels=np.array(list(read('training','labels')))
testData=np.array(list(read('testing','images')))
testLabels=np.array(list(read('testing','labels')))
# Understand the shapes of each variable carrying data
print(trainData.shape, trainLabels.shape)
print(testData.shape, testLabels.shape)
# display one image from each label
""" ==========
YOUR CODE HERE
========== """
# Scan the training data in order and show the first example of each digit 0-9.
displayed_already = []
sz = trainData.shape
for i in range(0, sz[0]):
    label = trainLabels[i]
    if label not in displayed_already:
        print('Image with label: ', label)
        plt.imshow(trainData[i, :, :], cmap = 'binary')
        plt.show()
        displayed_already.append(label)
    # Stop as soon as all ten digits have been shown.
    numShown = len(displayed_already)
    if numShown >= 10:
        print('All labels displayed.')
        break
Some helper functions are given below.
# a generator for batches of data
# yields data (batchsize, 28, 28) and labels (batchsize)
# if shuffle, it will load batches in a random order
from tqdm import tqdm
def DataBatch(data, label, batchsize, shuffle=True):
    """Yield (data, label) minibatches of size `batchsize` (last may be smaller).

    With shuffle=True the examples are visited in a random order.
    """
    n = data.shape[0]
    order = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batchsize):
        sel = order[start:start + batchsize]
        yield data[sel], label[sel]
# tests the accuracy of a classifier
def test(testData, testLabels, classifier):
    """Return the percentage accuracy of `classifier` over the test set."""
    batchsize = 50
    num_correct = 0.
    # Fixed order (shuffle=False) so accuracy is reproducible run to run.
    for batch, truth in tqdm(DataBatch(testData, testLabels, batchsize, shuffle=False)):
        num_correct += np.sum(classifier(batch) == truth)
    return num_correct / testData.shape[0] * 100
# a sample classifier
# given an input it outputs a random class
class RandomClassifier():
    """Baseline classifier: ignores its input and predicts a uniformly random class."""

    def __init__(self, classes=10):
        # Number of distinct labels to sample from.
        self.classes = classes

    def __call__(self, x):
        # One random label per example in the batch.
        return np.random.randint(self.classes, size=x.shape[0])
# Baseline: a random classifier should land near 10% on 10 balanced classes.
randomClassifier = RandomClassifier()
print('Random classifier accuracy: %f' %
      test(testData, testLabels, randomClassifier))
Here you will implement a function that computes the confusion matrix for a classifier. The matrix (M) should be nxn where n is the number of classes. Entry M[i,j] should contain the fraction of images of class i that was classified as class j. Can you justify the accuracy given by the random classifier?
# Using the tqdm module to visualize run time is suggested
from tqdm import tqdm
# It would be a good idea to return the accuracy, along with the confusion
# matrix, since both can be calculated in one iteration over test data, to
# save time
def Confusion(testData, testLabels, classifier):
    """Compute the 10x10 confusion matrix and accuracy of `classifier`.

    Per the problem statement, M[i, j] is the fraction of test images whose
    TRUE label is i that the classifier predicted as j (each row sums to 1).
    The accuracy returned is the overall fraction of correct predictions.
    """
    numClasses = 10
    batchsize = 50
    counts = np.zeros((numClasses, numClasses))
    # shuffle=False: order does not matter for counting and keeps runs reproducible.
    for data, label in tqdm(DataBatch(testData, testLabels, batchsize, shuffle=False)):
        pred = classifier(data)
        # zip over the actual batch: the final batch may be smaller than batchsize
        # (the old `for i in range(batchsize)` indexed past a ragged last batch).
        for truth, guess in zip(label, pred):
            # Row = true class, column = predicted class (the old code had them
            # transposed relative to the spec).
            counts[int(truth), int(guess)] += 1
    total = counts.sum()
    acc = np.trace(counts) / total if total else 0.0
    # Normalize each row to fractions; guard classes absent from the test set.
    rowsums = counts.sum(axis=1, keepdims=True)
    M = np.divide(counts, rowsums, out=np.zeros_like(counts), where=rowsums > 0)
    return M, acc
def VisualizeConfusion(M):
    """Show confusion matrix M as an image and print it rounded to 2 decimals."""
    plt.figure(figsize=(14, 6))
    plt.imshow(M)
    plt.show()
    # Printed as well, since small off-diagonal values are hard to see in the image.
    print(np.round(M,2))
# Confusion matrix and accuracy for the random baseline.
M_ran,acc_ran = Confusion(testData, testLabels, randomClassifier)
VisualizeConfusion(M_ran)
print('\n Accuracy: ', acc_ran)
The accuracy of the random classifier is shown above to be 9.9% for one run. Over many runs it averages about 10% accuracy. This makes perfect sense, as there are 10 classes, and the classifier is random, so there is a 1 in 10 chance, or 10%, to pick the correct class for a given piece of data. Looking at the confusion matrix, each row sums to 1, which makes sense because the row entries represent probabilities, and each cell is around 0.1, which again makes sense because it is a random classifier, so each of the cells have a 10% chance of being chosen as the label by the classifier.
from sklearn.neighbors import KNeighborsClassifier
class KNNClassifer():
    """k-nearest-neighbor digit classifier backed by sklearn, operating on
    flattened (N, H*W) image vectors."""

    def __init__(self, k=3):
        # k is the number of neighbors involved in voting
        self.classifier = KNeighborsClassifier(n_neighbors=k)

    def train(self, trainData, trainLabels):
        """Fit on (N, H, W) images by flattening each to a 1-D feature vector."""
        n = trainData.shape[0]
        self.classifier.fit(trainData.reshape(n, -1), trainLabels)

    def __call__(self, x):
        """Return predicted labels for a batch of (N, H, W) images."""
        return self.classifier.predict(x.reshape(x.shape[0], -1))
# test your classifier with only the first 100 training examples (use this
# while debugging)
# note you should get ~ 65 % accuracy
knnClassiferX = KNNClassifer()
knnClassiferX.train(trainData[:100], trainLabels[:100])
print ('KNN classifier accuracy: %f'%test(testData, testLabels, knnClassiferX))
# test your classifier with all the training examples (This may take a while)
knnClassifer = KNNClassifer()
knnClassifer.train(trainData, trainLabels)
print('done')
# display confusion matrix for your KNN classifier with all the training examples
# (This may take a while)
""" ==========
YOUR CODE HERE
========== """
# Confusion matrix and accuracy of the fully-trained KNN classifier.
M_ran,acc_ran = Confusion(testData, testLabels, knnClassifer)
VisualizeConfusion(M_ran)
print('\n Accuracy: ', acc_ran)
Here the accuracy is 97%, much better than random, and the confusion matrix reflects that as its diagonal entries, representing correct classifications, are all around 0.97. Looking at the confusion matrix, the number 2 is most often predicted erroneously to be 1 or 7, likely because they resemble the shape of the 2. Note that it isn't necessarily visible in the confusion matrix since that matrix is only printed to 2 decimal places.
Here you will implement a simple KNN classifer in PCA space (for k=3 and 25 principal components). You should implement PCA yourself using svd (you may not use sklearn.decomposition.PCA or any other package that directly implements PCA transformations
Is the testing time for PCA KNN classifier more or less than that for KNN classifier? Comment on why it differs if it does.
class PCAKNNClassifer():
    """KNN classifier operating in PCA space.

    PCA is implemented from scratch via an SVD of the covariance matrix
    of the flattened training images.
    """
    def __init__(self, components=25, k=3):
        # components = number of principal components
        # k is the number of neighbors involved in voting
        self.comp, self.k = components, k
        self.classifier = KNeighborsClassifier(n_neighbors=k)

    def train(self, trainData, trainLabels):
        """Fit the PCA projection and the KNN model.

        trainData: array of shape (n_samples, H, W), flattened internally.
        """
        flat = trainData.reshape(trainData.shape[0], -1)
        # Proper PCA projects mean-centered samples.  (np.cov centers
        # internally when computing the covariance, but the projection
        # itself must also use centered data.  For KNN the constant offset
        # cancels out of all pairwise distances, so predictions match the
        # uncentered variant.)
        self.mean = flat.mean(axis=0)
        centered = flat - self.mean
        covariance = np.cov(centered.T)
        # For a symmetric matrix the rows of vt are its eigenvectors,
        # ordered by decreasing singular value.
        u, s, vt = np.linalg.svd(covariance)
        self.W = vt[:self.comp].T  # projection matrix: (features, comp)
        self.classifier.fit(centered @ self.W, trainLabels)

    def __call__(self, x):
        """Project a batch of images into PCA space and predict labels."""
        flat = x.reshape(x.shape[0], -1)
        return self.classifier.predict((flat - self.mean) @ self.W)
# Sanity-check the PCA-KNN classifier on the first 100 training examples
# (use this while debugging).
pcaknnClassiferX = PCAKNNClassifer()
pcaknnClassiferX.train(trainData[:100], trainLabels[:100])
print ('KNN classifier accuracy: %f'%test(testData, testLabels, pcaknnClassiferX))

# Train on the full training set.
pcaknnClassifer = PCAKNNClassifer()
pcaknnClassifer.train(trainData, trainLabels)
# Display the confusion matrix for the PCA-KNN classifier trained on all
# the training examples.
conf_mat, accuracy = Confusion(testData, testLabels, pcaknnClassifer)
VisualizeConfusion(conf_mat)
print('\n Accuracy: ', accuracy)
PCA-KNN took 16 seconds to train, while KNN took 11 minutes. (Though this may be partly due to my laptop's CPU facing thermal throttling under its workload at training time; nevertheless, KNN took significantly longer.) Clearly, PCA-KNN is significantly faster while still achieving nearly the same accuracy, because PCA reduces the dimensionality while retaining a large percentage of the information, keeping only the principal components with the largest significance. This shows that PCA-KNN is an effective method to reduce training time.
Below is some helper code to train your deep networks.
Below is some helper code to train your deep networks. Complete the train function for DNN below. You should write down the training operations in this function. That means, for a batch of data you have to initialize the gradients, forward propagate the data, compute error, do back propagation and finally update the parameters. This function will be used in the following questions with different networks. You can look at https://pytorch.org/tutorials/beginner/pytorch_with_examples.html for reference.
# base class for your deep neural networks. It implements the training loop (train_net).
# You will need to implement the "__init__()" function to define the networks
# structures and "forward()", to propagate your data, in the following problems.
import torch.nn.init
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from tqdm import tqdm
from scipy.stats import truncnorm
class DNN(nn.Module):
    """Base class for the deep networks in this notebook.

    Implements the training loop (train_net); subclasses define the
    network structure in __init__ and the data propagation in forward.
    """
    def __init__(self):
        super(DNN, self).__init__()
        pass

    def forward(self, x):
        raise NotImplementedError

    def train_net(self, trainData, trainLabels, epochs=1, batchSize=50):
        """Train with Adam (lr 3e-4) and cross-entropy loss, printing the
        test-set accuracy after every epoch."""
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=3e-4)
        for epoch in range(epochs):
            self.train()  # set network in training mode
            for data, labels in DataBatch(trainData, trainLabels, batchSize, shuffle=True):
                inputs = Variable(torch.FloatTensor(data))
                targets = Variable(torch.LongTensor(labels))
                # standard step: clear grads, forward, loss, backward, update
                optimizer.zero_grad()
                loss = criterion(self.forward(inputs), targets)
                loss.backward()
                optimizer.step()
            self.eval()  # set network in evaluation mode
            print ('Epoch:%d Accuracy: %f'%(epoch+1, test(testData, testLabels, self)))

    def __call__(self, x):
        """Forward a batch and return the argmax class index per sample."""
        logits = self.forward(Variable(torch.FloatTensor(x)))
        return np.argmax(logits.data.cpu().numpy(), 1)
# helper function to get a weight Parameter, initialised from a truncated
# normal distribution (std 0.01, truncated to +/- 1)
def weight_variable(shape):
    samples = truncnorm.rvs(-1/0.01, 1/0.01, scale=0.01, size=shape)
    return Parameter(torch.Tensor(samples), requires_grad=True)
# helper function to get a bias Parameter filled with the constant 0.1
def bias_variable(shape):
    return Parameter(torch.Tensor(np.ones(shape)*0.1), requires_grad=True)
# example linear classifier - input connected to output
# you can take this as an example to learn how to extend DNN class
class LinearClassifier(DNN):
    """Single-layer linear classifier: flattens the input and applies one
    affine map producing the class logits."""
    def __init__(self, in_features=28*28, classes=10):
        super(LinearClassifier, self).__init__()
        self.weight1 = weight_variable((classes, in_features))
        self.bias1 = bias_variable((classes))

    def forward(self, x):
        # flatten to (batch, in_features), then one affine transformation
        flat = x.view(x.size(0), -1)
        return torch.addmm(self.bias1, flat, self.weight1.t())
def _load_split(split):
    # Read images, add a trailing channel axis, scale to [0, 1], and move
    # channels first (N, 1, H, W); labels become an int32 vector.
    images = np.float32(np.expand_dims(np.array(list(read(split, 'images'))), -1)) / 255
    labels = np.int32(np.array(list(read(split, 'labels'))))
    return images.transpose((0, 3, 1, 2)), labels

trainData, trainLabels = _load_split('training')
testData, testLabels = _load_split('testing')
# test the example linear classifier (note you should get around 90% accuracy
# for 10 epochs and batchsize 50)
linearClassifier = LinearClassifier()
linearClassifier.train_net(trainData, trainLabels, epochs=10)
print ('Linear classifier accuracy: %f'%test(testData, testLabels, linearClassifier))

# display confusion matrix
conf_mat, accuracy = Confusion(testData, testLabels, linearClassifier)
VisualizeConfusion(conf_mat)
print('\n Accuracy: ', accuracy)
The simple linear classifier implemented in the cell already performs quite well. Plot the filter weights corresponding to each output class (weights, not biases) as images. (Normalize weights to lie between 0 and 1 and use color maps like 'inferno' or 'plasma' for good results). Comment on what the weights look like and why that may be so.
# Plot filter weights corresponding to each class, reshaped to 28x28 images.
# linearClassifier.weight1.data holds the first (only) layer's weights,
# one row per output class.
weights = linearClassifier.weight1.data
print(weights.shape)
for neuron, row in enumerate(weights):
    print("Neuron: ", neuron)
    # normalise to [0, 1] before display
    normalised = (row - row.min()) / (row.max() - row.min())
    plt.imshow(normalised.reshape(28, 28), cmap='inferno')
    plt.show()
The weights somewhat resemble the class they represent, some more than others. For example, the weights for 0, 2, and 3 clearly show visual evidence of the number. The strangest is class 5, as it bears almost no resemblance, followed by class 4, which again does not really look like the number. What's interesting is that class 5 has a low accuracy of 0.9, but class 8 has the lowest at 0.87, despite looking somewhat like an 8. Clearly the visual appearance of the weights plays some role, but the weights and accuracies have more to them than just visuals.
Here you will implement an MLP. The MLP should consist of 2 layers (matrix multiplication and bias offset) that map to the following feature dimensions:
in_features -> hidden, hidden -> classes
The hidden layer should be followed with a ReLU nonlinearity. The final layer should not have a nonlinearity applied as we desire the raw logits output.
Display the confusion matrix and accuracy after training. Note: You should get ~ 97 % accuracy for 10 epochs and batch size 50.
Plot the filter weights corresponding to the mapping from the inputs to the first 10 hidden layer outputs (out of 100). Do the weights look similar to the weights plotted in the previous problem? Why or why not?
class MLPClassifer(DNN):
    """Two-layer MLP: input -> hidden (ReLU) -> classes (raw logits)."""
    def __init__(self, in_features=28*28, classes=10, hidden=100):
        super(MLPClassifer, self).__init__()
        self.feat = in_features
        self.numClasses = classes
        self.hidden = hidden
        # two affine layers -> two weight matrices and two bias vectors
        self.wLayer1 = weight_variable((in_features, hidden))
        self.wLayer2 = weight_variable((hidden, classes))
        self.biasInt1 = bias_variable((hidden))
        self.biasInt2 = bias_variable((classes))

    def forward(self, x):
        # flatten, hidden layer with ReLU, then the output layer with no
        # nonlinearity so raw logits are returned
        flat = x.reshape((x.shape[0], self.feat))
        hidden_out = F.relu(flat @ self.wLayer1 + self.biasInt1)
        self.y = hidden_out @ self.wLayer2 + self.biasInt2
        return self.y
# Train the MLP and show its confusion matrix.
mlpClassifer = MLPClassifer()
mlpClassifer.train_net(trainData, trainLabels, epochs=10, batchSize=50)

# Plot confusion matrix
conf_mat, accuracy = Confusion(testData, testLabels, mlpClassifer)
VisualizeConfusion(conf_mat)
print('\n Accuracy: ', accuracy)
# Plot the first-layer weights mapping the inputs to the first 10 hidden
# units (out of 100), each reshaped to a 28x28 image.
weights = np.transpose(mlpClassifer.wLayer1.data[:, 0:10])
print(weights.shape)
for neuron, row in enumerate(weights):
    print("Neuron: ", neuron)
    # normalise to [0, 1] before display
    normalised = (row - row.min()) / (row.max() - row.min())
    plt.imshow(normalised.reshape(28, 28), cmap='inferno')
    plt.show()
The weights do not much resemble those of the linear classifier; in fact, they do not look like visuals of the numbers at all. This is expected, as this is a multilayer perceptron and only the first 10 of the 100 hidden units' weights are being displayed. As a result, the weights are more intricate than those of the linear classifier and thus do not resemble the numbers as much.
Here you will implement a CNN with the following architecture:
So, 2 convolutional layers, followed by 1 fully connected hidden layer and then the output layer
Display the confusion matrix and accuracy after training. You should get around ~ 98 % accuracy for 10 epochs and batch size 50.
Note: You are not allowed to use torch.nn.Conv2d() and torch.nn.Linear(), Using these will lead to deduction of points. Use the declared conv2d(), weight_variable() and bias_variable() functions. Although, in practice, when you move forward after this class you will use torch.nn.Conv2d() which makes life easier and hides all the operations underneath.
def conv2d(x, W, stride, bias=None):
    """2-D convolution with a fixed padding of 2.

    x: input batch (N, C, H, W)
    W: weights of shape (out, in, kH, kW)
    stride: convolution stride
    bias: optional per-output-channel bias
    """
    return F.conv2d(x, W, bias, stride=stride, padding=2)
# Defining a Convolutional Neural Network
class CNNClassifer(DNN):
    """CNN: two strided conv layers (ReLU) followed by one hidden fully
    connected layer (ReLU) and a linear output layer (raw logits).

    classes: number of output classes.
    n: number of filters in the first conv layer (the second uses 2*n).
    Assumes 28x28 single-channel inputs.
    """
    def __init__(self, classes=10, n=5):
        super(CNNClassifer, self).__init__()
        self.n = n
        self.numClasses = classes
        self.hidden = 64
        # conv weights are (out, in, kH, kW); biases are shaped to
        # broadcast over (N, C, H, W)
        self.convWeights1 = weight_variable((n, 1, 5, 5))
        self.convWeights2 = weight_variable((2*n, n, 5, 5))
        self.convBias1 = bias_variable((1, n, 1, 1))
        self.convBias2 = bias_variable((1, 2*n, 1, 1))
        # Each 5x5 conv uses padding 2 and stride 2, so 28x28 -> 14x14 ->
        # 7x7; the flattened conv output therefore has 2*n*7*7 features.
        # (The previous expression (2*n)*(2+n)*(n+2) only equals this for
        # the default n=5 and mis-sizes the layer for any other n.)
        self.linearWeights1 = weight_variable((2*n*7*7, self.hidden))
        self.linearWeights2 = weight_variable((self.hidden, self.numClasses))
        self.linearBias1 = bias_variable((self.hidden))
        self.linearBias2 = bias_variable((self.numClasses))

    def forward(self, x):
        # conv -> ReLU -> conv -> ReLU
        h = F.relu(conv2d(x, self.convWeights1, 2) + self.convBias1)
        h = F.relu(conv2d(h, self.convWeights2, 2) + self.convBias2)
        # flatten, hidden FC layer with ReLU, then the output layer with
        # no nonlinearity so raw logits are returned
        h = h.view(h.shape[0], -1)
        h = F.relu(h @ self.linearWeights1 + self.linearBias1)
        self.y = h @ self.linearWeights2 + self.linearBias2
        return self.y
# Train the CNN and show its confusion matrix.
cnnClassifer = CNNClassifer()
cnnClassifer.train_net(trainData, trainLabels, epochs=10)

# Plot confusion matrix
conf_mat, accuracy = Confusion(testData, testLabels, cnnClassifer)
VisualizeConfusion(conf_mat)
print('\n Accuracy: ', accuracy)